/* 
    Purpose: Using the 1990 Census (1% sample), this file locates black and white men aged 
             30-50 who are fathers of a child younger than 18 in the same household. Other 
             variables necessary to create average predicted father income (in 6b) are also 
             cleaned.

    Note: Income was asked of all individuals 15+ in the 1990 Census. 

    Creates: Census1990_fathers_ages30to50.dta
*/
* Start from an empty session and work relative to the Census data folder
clear
set more off
cd "$Mydirectory1/1_DataSources/CensusData/"

    * Raw 1% extract (download from IPUMS USA)
    use ./input/Census1990_1pct_raw.dta, clear
        * Respondents carry different person weights, so every collapse
        * further down must be weighted.
        summarize perwt

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** SET UP INCOME VARIABLES
***************************

* Clean individual income (inctot) and Census family income (ftotinc, "v1") identically:
* the value 9999999 is set to missing and negative amounts are floored at zero
    foreach v of varlist inctot ftotinc {
        replace `v' = . if `v' == 9999999
        replace `v' = 0 if `v' < 0
    }

* Family income ("v2"): add up individual incomes of everyone in the same family unit
* (missing individual incomes contribute zero to the total)
    bysort serial famunit: egen fam_income = total(inctot)

* How often is v1 missing while v2 is positive?
    /*Note: These discrepancies appear to come
            mostly from individuals living
            in group quarters. */
    count if ftotinc==. & fam_income>0

* Where v1 is missing but v2 is positive, fill v1 with v2
    replace ftotinc = fam_income if ftotinc==. & fam_income>0

* Household income

    //Take family income once per family unit (from the member with the lowest pernum);
    //every other row, and any family with missing income, contributes zero
    sort serial famunit pernum
    by serial famunit: gen temp = cond(_n==1 & ftotinc!=., ftotinc, 0)

    //Sum over the "separate" families sharing a serial to get household income
    bysort serial: egen hh_income = total(temp)
    drop temp fam_income

    //From here on, fam_income refers to the harmonized Census family income
    rename ftotinc fam_income

/*
   Express all income measures in 1950 dollars by scaling with the ratio of the
   1950 CPI to the 1990 CPI.
   Source: https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-)
*/
    gen CPI1990 = 130.7
    gen CPI1950= 24.1

    foreach incvar in inctot fam_income hh_income {
        replace `incvar' = `incvar' * (CPI1950 / CPI1990)
    }
  
* Locate children and count them per father BEFORE restricting on income
    /*Note: Income is only recorded for persons 15+ (see the note at the top of
            this file), so younger children have missing inctot. If the income
            restriction below were applied first, those children would be dropped
            and fathers would be identified (and kidsperdad counted) only from
            children aged 15-17 with non-zero income. */
    preserve
        keep if age<18 //Restrict to children younger than 18
        keep serial poploc

        drop if poploc==0 | poploc==. //Exclude children without a father in the house

        * Count number of kids per father
        gen tagkid = 1
        bysort serial poploc: egen kidsperdad = total(tagkid)
        label var kidsperdad "Number of kids living in Census father's household"

        bysort serial poploc: keep if _n==1 //Keep one row per father; some fathers have multiple children in the Census
        rename poploc pernum //so the child file can be matched to the father's own person record
        drop tagkid

        tempfile children
        save `children'
    restore

* Keep respondents with non-zero and non-missing income
    foreach var of varlist inctot fam_income hh_income {
        drop if `var'==0 | `var'==.
    }

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** IDENTIFY FATHERS
***************************

* Keep the sample of fathers: income-restricted persons who are the father (poploc)
* of at least one co-resident child under 18
    merge 1:1 serial pernum using `children'
    keep if _merge==3
    drop _merge

* Household size-adjusted weight
    gen wgt1990_hhsizeadj = perwt*kidsperdad
    label var wgt1990_hhsizeadj "Alternative Census weight; adjusted for # kids in father's household"

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** CREATE OTHER NECESSARY VARIABLES
***************************
  
* Restrict the sample to fathers aged 30-50 whose race code is 1 or 2 (black and white fathers)
    keep if age>=30 & age<=50
    keep if inlist(race,1,2)

    tab statefip, m
    rename statefip fips

* Region of current residence (four-way grouping of the detailed region codes)
    * Northeast: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont, New Jersey, New York, Pennsylvania
    * Midwest: Illinois, Indiana, Michigan, Ohio, Wisconsin, Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, South Dakota
    /* South: Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia, West Virginia, Alabama,
              Kentucky, Mississippi, Tennessee, Arkansas, Louisiana, Oklahoma, Texas */
    * West: Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah, Wyoming, California, Oregon, Washington --note: Census puts AK and HI in with Pacific division

    gen region_merge = .
    replace region_merge = 1 if inlist(region,11,12)        //Northeast
    replace region_merge = 2 if inlist(region,21,22)        //Midwest
    replace region_merge = 3 if region>=31 & region<=33     //South
    replace region_merge = 4 if inlist(region,41,42)        //West
    tab region, m
    tab region_merge, m

    * Code 92 = puma boundaries cross state lines, so no region can be assigned
    drop if region==92

    label define region_l 1 "NORTHEAST" 2 "MIDWEST" 3 "SOUTH" 4 "WEST"
    label values region_merge region_l
    tab region_merge, m

    * Indicator for living in the South (0 for every other value of region_merge)
    gen south_merge = (region_merge==3)
  
* Education variable: collapse detailed education (educd) into five ordered groups
    tab educd, m 
    tab educd, nol
    
    gen edu=.
    replace edu=1 if educd<20  //less than grade school (includes people with no schooling)
    replace edu=2 if educd==20 //8th grade
    replace edu=3 if inlist(educd,30,40,50,61) //<hs
    replace edu=4 if inlist(educd,62) //hs
    * The upper bound must be tested on educd (the detailed variable), not educ:
    * this keeps code 999 and Stata missing values (which compare as larger than
    * any number) out of the ">hs" group, leaving edu missing for them.
    replace edu=5 if educd>64 & educd<999 //>hs. "999" is missing
    tab educd edu, m

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

****************
**** ASSIGN COARSENED OCCS
****************

* Order the data by 1950-basis occupation code
    sort occ1950

* Number of distinct Census occupation codes present in the 1990 data
    by occ1950: gen nvals = (_n==1) //flags the first row of each occupation code
    count if nvals==1

* Occupations coded 200-290: give the classwkr==1 (self-employed) group its own codes by adding 1000
    replace occ1950 = occ1950 + 1000 if inrange(occ1950,200,290) & classwkr==1

* Attach the coarsened ANES occupation (occ1950ej) from the crosswalk
    merge m:1 occ1950 using ../Crosswalks/Crosswalk_1950Census_toANES.dta
    assert _merge!=1   //every occupation code in the data must appear in the crosswalk
    keep if _merge==3  //discard crosswalk rows that match no one in the data
    drop _merge

    * The only code allowed to lack a coarsened occupation is 997 (occupation missing or unknown)
    assert occ1950==997 if occ1950ej==.
    drop if occ1950ej==.
    tab occ1950 if occ1950ej==99, m nol
 
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

****************
**** SAVE 
****************

* Trim to the analysis variables
    rename occ1950ej fatheroccej

    keep age race edu region_merge south_merge fatheroccej inctot fam_income hh_income perwt wgt*

    * Marker identifying these rows as Census observations
    gen census = 1
    label variable census "Census obs"

    * Log version of each income measure, plus labels for both the level and the log
    foreach inc in inctot fam_income hh_income {
        gen log_father_`inc' = ln(`inc')
        label variable log_father_`inc' "Log `inc', Census"
        label variable `inc' "`inc', Census"
    }

    * Final names for the family and household income levels
    rename fam_income faminc
    rename hh_income HHinc

    compress
    save ./output/Census1990_fathers_ages30to50.dta, replace
                    
     